In [1]:
import bamboolib as bam
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as mpl
C:\Users\Rodrigo\Anaconda3\lib\site-packages\statsmodels\tools\_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [2]:
df = pd.read_csv("Datasets/D1-1.csv")
In [3]:
df = df.loc[~(df['race'].isin(['Overall_totals']))]
df = df.loc[~(df['race'].isna())]
df = df.loc[~(df['job_category'].isin(['Previous_totals']))]
df['countInt'] = pd.to_numeric(df['count'], downcast='integer', errors='coerce')
df_totals = df.loc[df['job_category'].isin(['Totals'])]
tmp_groupby_df = df_totals.groupby(['company']).agg(**{"'countInt_sum'": ('countInt', 'sum')}).reset_index()
df_totals_1 = df_totals.merge(tmp_groupby_df, on=['company'])
df_company_totals = df_totals_1[['company', "'countInt_sum'"]]
df_company_totals = df_company_totals.drop_duplicates(subset=None, keep='first', inplace=False)
df_company_totals
In [4]:
df_company_totals = df_company_totals.rename(columns={'company': 'company', "'countInt_sum'": 'countInt_sum'})
In [5]:
df['percentage'] =""

#df.loc[df.company == df_company_totals.company, 'percentage'] = df_company_totals.loc['countInt_sum']

for index in df.index:
    for company in df_company_totals.index:
        if df_company_totals.loc[company,'company'] == df.loc[index,'company']:
            df.loc[index,'percentage'] = df.loc[index,'countInt'].item()/df_company_totals.loc[company,'countInt_sum'].item()

df.loc[df['job_category'].isin(['Executive/Senior officials & Mgrs']), 'job_category'] = 'Executives'
df.loc[df['job_category'].isin(['First/Mid officials & Mgrs']), 'job_category'] = 'Managers'
df
df_pipo = df.groupby(['company']).agg(**{"'countInt_sum'": ('countInt', 'sum')}).reset_index()
df
In [6]:
df1 = pd.read_csv("Datasets/D2.csv")
df1['Female'] = df1.iloc[:,4]
df1['Male'] = df1.iloc[:,6]
df1['White'] = df1.iloc[:,9]
df1['Asian'] = df1.iloc[:,11]
df1['Latino'] = df1.iloc[:,13]
df1['Black'] = df1.iloc[:,15]
df1['Multi'] = df1.iloc[:,17]
df1['Other'] = df1.iloc[:,19]

df1 = df1.drop(columns=['%', '%.1', '%.2', '%.3', '%.4', '%.5', '%.6', '%.7', '%.8', 'Undeclared', 'Unnamed: 21'])
df1
In [7]:
missing_values_count = df.isnull().sum()
print (missing_values_count)
total_cells = np.product(df.shape)
total_missing = missing_values_count.sum()
print ("% of missing data = ",(total_missing/total_cells) * 100)
company         0
year            0
race            0
gender          0
job_category    0
count           0
countInt        0
percentage      0
dtype: int64
% of missing data =  0.0
In [8]:
racegroups = df.groupby(["race"])
a = racegroups.get_group("Asian")
w= racegroups.get_group("White")
b= racegroups.get_group("Black_or_African_American")
h= racegroups.get_group("Hispanic_or_Latino")
t= racegroups.get_group("Two_or_more_races")
hw = racegroups.get_group("Native_Hawaiian_or_Pacific_Islander")
ind= racegroups.get_group("American_Indian_Alaskan_Native")
na = a["countInt"].sum()
nw = w["countInt"].sum()
nb= b["countInt"].sum()
nh= h["countInt"].sum()
nt= t["countInt"].sum()
nhw= hw["countInt"].sum()
nind= ind["countInt"].sum()
In [9]:
companies = df.groupby(['company'])
In [10]:
print("Race group distributions in dataset")
labels = 'Asian', 'White', 'Black_or_African_American', 'Hispanic_or_Latino', 'Two_or_more_races', 'Native_Hawaiian_or_Pacific_Islander', ' '
sizes = [na,nw,nb,nh,nt,nhw,nind]
fig1, ax1 = mpl.subplots(figsize=(8, 8))

explode = (0,0,0,0,0,0.1,0)
ax1.pie(sizes, explode=explode,labels=labels, autopct ='%1.1f%%', shadow=False, startangle=90)
ax1.axis('equal')

mpl.show()


import plotly.express as px

## bar chart on companies
print("Race distributions per company")
fig = px.histogram(df, x='company', y='countInt', color='race')
fig.update_yaxes(title_text='number of employees')
fig
Race group distributions in dataset
Race distributions per company
In [11]:
import plotly.express as px
print("Race Distributions by job category")
fig = px.histogram(df, x='countInt', y='job_category', color='race')
fig
Race Distributions by job category
In [12]:
#Latinos
from plotly.subplots import make_subplots
import plotly.graph_objects as go


dfLatino = df.loc[df["race"].isin(["Hispanic_or_Latino"])]
dfLatino = dfLatino.loc[~(dfLatino['job_category'].isin(['Totals']))]
#dfLatino


dfWhite= df.loc[df["race"].isin(["White"])]
dfWhite = dfWhite.loc[~(dfWhite['job_category'].isin(['Totals']))]
#dfWhite

dfBlack= df.loc[df["race"].isin(["Black_or_African_American"])]
dfBlack = dfBlack.loc[~(dfBlack['job_category'].isin(['Totals']))]

dfAsian= df.loc[df["race"].isin(["Asian"])]
dfAsian = dfAsian.loc[~(dfAsian['job_category'].isin(['Totals']))]

dfHawai= df.loc[df["race"].isin(["Native_Hawaiian_or_Pacific_Islander"])]
dfHawai = dfHawai.loc[~(dfHawai['job_category'].isin(['Totals']))]

dfIndian= df.loc[df["race"].isin(["American_Indian_Alaskan_Native"])]
dfIndian = dfIndian.loc[~(dfIndian['job_category'].isin(['Totals']))]

dfTwo= df.loc[df["race"].isin(["Two_or_more_races"])]
dfTwo = dfTwo.loc[~(dfTwo['job_category'].isin(['Totals']))]

fig = make_subplots(rows = 4,cols = 2, start_cell="bottom-left", subplot_titles = ("Latino", "White", "Black","Asian",
                                                                                  "Native Hawaiian", "American Indian", "Two or More races"))

fig.add_trace(
    go.Bar(x= dfLatino.job_category, y=dfLatino.countInt), row= 1,col=1)

fig.add_trace(
    go.Bar(x=dfWhite.job_category, y=dfWhite.countInt),row=1,col=2)
fig.add_trace(
    go.Bar(x=dfBlack.job_category, y=dfBlack.countInt),row=2,col=1)
fig.add_trace(
    go.Bar(x=dfAsian.job_category, y=dfAsian.countInt),row=2,col=2)
fig.add_trace(
    go.Bar(x=dfHawai.job_category, y=dfHawai.countInt),row=3,col=1)
fig.add_trace(
    go.Bar(x=dfIndian.job_category, y=dfIndian.countInt),row=3,col=2)
fig.add_trace(
    go.Bar(x=dfTwo.job_category, y=dfTwo.countInt),row=4,col=1)

fig.update_layout(height = 1500, width = 900, title_text = "Job Categories by Race Groups")
fig.show()
In [13]:
print("Gender representation by race")
fig0 = px.histogram(df, x='race', y='countInt', color='gender', template='plotly_white')
fig0.update_yaxes(title_text='Number of employees')
fig0
Gender representation by race
In [14]:
## pie chart on genders
print("Gender distribution in dataset")
genders =  df.groupby(["gender"])
male= genders.get_group("male")
female= genders.get_group("female")
nmale = male["countInt"].sum()
nfemale = female["countInt"].sum()
labels = 'male', 'female'
sizes = [nmale,nfemale]
fig1, ax1 = mpl.subplots(figsize=(6, 6))

explode = (0,0)
ax1.pie(sizes, explode=explode,labels=labels, autopct ='%1.1f%%', shadow=False, startangle=90)
ax1.axis('equal')

mpl.show()
print("numer of males:", nmale)
print("numer of females:", nfemale)
Gender distribution in dataset
numer of males: 602886
numer of females: 268250

Creating Attributes to use in Machine Learning

In order to achieve this a number of computations using Shannon-Wiener Diversity Index will be calculated and will generate flexible information that can be adjusted to other datasets even when the diversity data may differ to the peresented here. The formula can automatically adjust the Eveness measure for datasets that identify different ethnic groups. The formula used for this is the following:

\begin{equation}H = - \sum Pi(\ln Pi) \end{equation}

Where Pi is the proportion of each "group" in the sample. Once we find H which is the shannon diversity index, we can calculate (E) Evenness which will normalize the index into a more interpretable measure which ranges from 0 to lowest level of equal distribution accross groups to 1 which is perfect equitable distribution accross all groups. Evenness can be calculated as follows:

\begin{equation}E_H = \frac{ H }{H_{max}} \end{equation}

Where H max is logarithm of the total number of groups.

In [15]:
#Executives
df_execs = df.loc[df['job_category'].isin(['Executives'])]
tmp_groupby_df = df_execs.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()

#Managers
df_managers = df.loc[df['job_category'].isin(['Managers'])]
mtmp_groupby_df = df_managers.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()

#Professionals
df_professionals = df.loc[df['job_category'].isin(['Professionals'])]
ptmp_groupby_df = df_professionals.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()


#Other employees

df_others = df.loc[df['job_category'].isin(['Technicians', 'Sales workers', 'Administrative support', 'Craft workers', 'operatives', 'laborers and helpers', 'Service workers'])]
otmp_groupby_df = df_others.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()


#total employees
df_totals = df
ttmp_groupby_df = df_totals.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()
 
In [ ]:
 
In [ ]:
 
In [16]:
# Calculating each categorie's entropy as shannon index
def Sgenerator(yourDF,tempDF):
    yourDF["S"]=""
    for item in tempDF.index:
        for index in  yourDF.index:
            if yourDF.loc[index,'company'] == tempDF.loc[item,'company']:
                yourDF.loc[index,'S'] = tempDF.loc[item,'countInt_sum']
def piGenerator(yourDF):
    yourDF['pi'] = yourDF['countInt']/yourDF['S']
    yourDF['lnPi'] =np.where(yourDF['pi'] !=0.0,np.log(yourDF['pi'].astype(np.float64)),0) 
    yourDF['piXlnPi'] = yourDF['pi'] * yourDF['lnPi']
#EXECUTIVES
Sgenerator(df_execs,tmp_groupby_df)
piGenerator(df_execs)
#MANAGERS
Sgenerator(df_managers,mtmp_groupby_df)
piGenerator(df_managers)
#PROFESSIONALS
Sgenerator(df_professionals,ptmp_groupby_df)
piGenerator(df_professionals)
#OTHER EMPLOYEES
Sgenerator(df_others,otmp_groupby_df)
piGenerator(df_others)         
#TOTALEMPLOYEES
Sgenerator(df_totals,ttmp_groupby_df)
piGenerator(df_totals) 
In [ ]:
 
In [17]:
#grouping by gender 
#EXECUTIVES
            
df_execs_gender = df_execs.groupby(['company', 'gender']).agg(**{'countInt': ('countInt', 'sum')}).reset_index()
Sgenerator(df_execs_gender,tmp_groupby_df)
piGenerator(df_execs_gender)
#MANAGERS
df_managers_gender = df_managers.groupby(['company', 'gender']).agg(**{'countInt': ('countInt', 'sum')}).reset_index()
Sgenerator(df_managers_gender,mtmp_groupby_df)
piGenerator(df_managers_gender)
#TOTAL
df_totals_gender = df_totals.groupby(['company', 'gender']).agg(**{'countInt': ('countInt', 'sum')}).reset_index()
Sgenerator(df_totals_gender,ttmp_groupby_df)
piGenerator(df_totals_gender)
In [18]:
#calculating H Hmax and Eveness into tables
def calcH(yourDF):
    df_execs_num_cats = yourDF.groupby(['company']).agg(number_caegories=('job_category', 'size')).reset_index()
    df_execs_num_cats['Hmax'] = np.log(df_execs_num_cats['number_caegories'])

    yourDF_H = yourDF.groupby(['company']).agg(H=('piXlnPi', 'sum')).reset_index()
    yourDF_H['Hmax'] = df_execs_num_cats['Hmax']
    yourDF_H['H+'] = yourDF_H['H'] * -1
    yourDF_H['E'] = yourDF_H['H+']/yourDF_H['Hmax']
    yourDF_H = yourDF_H.reset_index()
    return yourDF_H
#EXECUTIVES
df_execs_H = calcH(df_execs)
#MANAGERS
df_managers_H = calcH(df_managers)
#PROFESSIONALS
df_professionals_H = calcH(df_professionals)
#OTHER EMPLOYEES
df_others_H = calcH(df_others)
#TOTAL EMPLOYEES
df_totals_H = calcH(df_totals)
In [19]:
#calculating H, Hmas and Evenness into tables for gender data
def calcHgender(yourDF):
    df_execs_num_cats = yourDF.groupby(['company']).agg(number_caegories=('gender', 'size')).reset_index()
    df_execs_num_cats['Hmax'] = np.log(df_execs_num_cats['number_caegories'])

    yourDF_H = yourDF.groupby(['company']).agg(H=('piXlnPi', 'sum')).reset_index()
    yourDF_H['Hmax'] = df_execs_num_cats['Hmax']
    yourDF_H['H+'] = yourDF_H['H'] * -1
    yourDF_H['E'] = yourDF_H['H+']/yourDF_H['Hmax']
    yourDF_H = yourDF_H.reset_index()
    return yourDF_H
df_execs_gender_H = calcHgender(df_execs_gender)
df_managers_gender_H = calcHgender(df_managers_gender)
df_totals_gender_H = calcHgender(df_totals_gender)
In [ ]:

In [ ]:
 
In [20]:
x_comp = pd.DataFrame(columns=['company','year','Eexec','EManager','EProfessional','EOther','ETotal','GenderExec','GenderManagement','GenderTotal'])
x_comp = x_comp.reset_index()
x_comp['company'] = df_totals_H['company']
x_comp['year'] = 2016
x_comp['Eexec'] = df_execs_H['E']
x_comp['EManager'] = df_managers_H['E']
x_comp['EProfessional'] = df_professionals_H['E']
x_comp['EOther'] = df_others_H['E']
x_comp['ETotal'] = df_totals_H['E']
x_comp['GenderExec'] = df_execs_gender_H['E']
x_comp['GenderManagement'] = df_managers_gender_H['E']
x_comp['GenderTotal'] = df_totals_gender_H['E']
In [21]:
x_comp
In [22]:
import plotly.express as px
fig = px.line(x_comp.sort_values(by=['company'], ascending=[True]), x='company', y=['Eexec', 'EManager', 'EProfessional', 'EOther', 'ETotal'], line_dash_sequence=['dot'])
fig.update_layout(xaxis_rangeslider_visible=True)
fig
In [23]:
fig = px.bar(x_comp, x='company', y='GenderTotal', color='GenderTotal', color_continuous_scale='pinkyl')
fig
In [24]:
fig = px.scatter_3d(x_comp, x='company', y='GenderExec', z='Eexec', color='Eexec', title='Executive Level Diversity and Gender Matrix')
fig.update_yaxes(title_text='Gender Balance')
xaxis=dict(
    tickvals=x_comp['company'],
    tickmode = 'linear',
    title='Companies',
    titlefont=dict(
        family='Courier New, monospace',
        size=18,
        color='#7f7f7f'
    )
)
fig.update_xaxes(dtick=6)
fig
In [25]:
fig = px.scatter_3d(x_comp, x='company', y='GenderManagement', z='EManager', color='EManager', title='Management Level Diversity and Gender Matrix')
fig.update_yaxes(title_text='Gender Balance')
fig.update_xaxes(dtick=6)
fig

Financial Data from the SEC

Financial records from publicly traded firms are available through EDGAR database. The financial statements for the companies in our datasets have been extracted from EDGAR and analysed using the following methodologies. The financial analysis per company will include metrics for all areas of the business:

Activity ratios

Working Capital Turnover: the result of \begin{equation}\frac{ Revenue }{average\, working \, capital}\end{equation} Total Assets Turnover: the result of \begin{equation}\frac{ Revenue }{average \, Fixed\,Assets}\end{equation}

Liquidity ratios

Current ratio: the result of \begin{equation}\frac{ Current\,assets }{Current\,liabilities}\end{equation}

Solvency:

Debt-to-Assets: \begin{equation}\frac{ Total\,debt }{Total\,assets}\end{equation} Interest Coverage: \begin{equation}\frac{EBIT}{Interest\,payments}\end{equation}

Profitability

Net profit margin: \begin{equation}\frac{ Net income }{Revenue}\end{equation} Operating margin: \begin{equation}\frac{ Operating profit }{Revenue}\end{equation} ROE: \begin{equation}\frac{ Net\, income }{Average\, total\,equity}\end{equation}

Valuation

P/E: \begin{equation}\frac{ Price\,per\,share }{Earnings\,per\,share}\end{equation} P/BV: \begin{equation}\frac{ Price\,per\,share}{Book\,value\,per\,share}\end{equation}

Leverage

Financial Leverage: \begin{equation}\frac{ Average\,total\,assets}{Total\,shareholders'\,equity}\end{equation}

Performance Ratios

Cash flow to revenue: \begin{equation}\frac{CFO}{Revenue}\end{equation} Debt payment: \begin{equation}\frac{CFO}{Cash\,paid\,for\,long\,term\,debt\,repayment}\end{equation} Investing & Financing: \begin{equation}\frac{CFO}{Cash\,outflows\,for\,investing\,and\,financing\,activities}\end{equation}

In [26]:
Financialsdf = pd.DataFrame(columns = ['company,','year','WCturnover','TAturnover','currentRatio','quickRatio', 
                                       'DebtToAssets','InterestCoverage', 'NetProfitMargin','operatingMargin','ROE','P/E',
                                      'P/BV','FinancialLeverage','PerformanceRatio','CashflowToRevenue','DebtPayment','Investing&Financing'])

Financialsdf = Financialsdf.reset_index()
Financialsdf
In [27]:
df_financials = pd.read_excel("Datasets/Financial/AllCompanies_Financials.xlsx")
In [28]:
df_financials["WorkingCapitalTurnover"] = df_financials['Revenue']/df_financials['Working Capital Current']
df_financials["Asset Turnover"] = df_financials['Revenue']/df_financials['Total Assets']
df_financials["current ratio"] = df_financials['Current Assets']/df_financials['Current Liabilitites']
df_financials["Solvency"] = df_financials['Total Liabilities']/df_financials['Total Assets']
df_financials["Net Profit Margin"] = df_financials['Net Income']/df_financials['Revenue']
df_financials["operating margin"] = df_financials['Operating Profit']/df_financials['Revenue']
df_financials["ROE"] = df_financials['Net Income']/df_financials['Shareholders Equity']
df_financials["Leverage"] = df_financials['Total Assets']/df_financials['Shareholders Equity']
df_financials["Cashflow-Revenue"] = df_financials['CashFlow']/df_financials['Revenue']
df_financial_ratios = df_financials[['Company', 'Year', 'WorkingCapitalTurnover', 'Asset Turnover', 'current ratio', 'Solvency', 'Net Profit Margin', 'operating margin', 'ROE', 'Leverage', 'Cashflow-Revenue']]
df_financial_ratios
In [29]:
df_corr_explor = pd.merge(df_financial_ratios, x_comp, how='left', left_on=['Company'], right_on=['company'])
df_corr_explor = df_corr_explor.drop(columns=['year', 'company', 'index'])
df_corr_explor
In [30]:
import plotly.express as px
fig = px.scatter(df_corr_explor.dropna(subset=['current ratio']), x='Eexec', y='current ratio', symbol_sequence=['square-open'], color='Eexec', trendline='ols', hover_name='Company')
fig
In [31]:
fig = px.scatter(df_corr_explor.dropna(subset=['current ratio']), x='EManager', y='current ratio', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
In [32]:
fig = px.scatter(df_corr_explor.dropna(subset=['Net Profit Margin']), x='EManager', y='Net Profit Margin', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
In [33]:
fig = px.scatter(df_corr_explor.dropna(subset=['operating margin']), x='EManager', y='operating margin', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
In [34]:
fig = px.scatter(df_corr_explor.dropna(subset=['operating margin']), x='GenderManagement', y='operating margin', color='GenderManagement', symbol_sequence=['square-open'], trendline='ols',hover_name='Company', color_continuous_scale='pinkyl')
fig
In [35]:
fig = px.scatter(df_corr_explor.dropna(subset=['Asset Turnover']), x='ETotal', y='Asset Turnover', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
In [36]:
fig = px.scatter(df_corr_explor.dropna(subset=['Asset Turnover']), x='ETotal', y='Asset Turnover', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
In [37]:
fig = px.scatter(df_corr_explor.dropna(subset=['ROE']), x='ETotal', color='EManager', symbol_sequence=['square-open'], trendline='ols', y='ROE',hover_name='Company')
fig
In [38]:
fig = px.scatter(df_corr_explor.dropna(subset=['Net Profit Margin']), x='GenderExec', color='EManager', symbol_sequence=['square'], trendline='ols', y='Net Profit Margin',hover_name='Company', color_continuous_scale='pinkyl')
fig
In [39]:
fig = px.scatter(df_corr_explor.dropna(subset=['operating margin']), x='GenderExec', color='EManager', symbol_sequence=['square'], trendline='ols', y='operating margin',hover_name='Company', color_continuous_scale='pinkyl')
fig
In [40]:
fig = px.scatter(df_corr_explor.dropna(subset=['operating margin']), x='GenderManagement', color='EManager', symbol_sequence=['square'], trendline='ols', y='operating margin',hover_name='Company', color_continuous_scale='pinkyl')
fig
In [41]:
fig = px.scatter(df_corr_explor.dropna(subset=['Cashflow-Revenue']), x='GenderManagement', color='EManager', symbol_sequence=['square'], trendline='ols', y='Cashflow-Revenue',hover_name='Company', color_continuous_scale='pinkyl')
fig
In [42]:
fig = px.scatter(df_corr_explor.dropna(subset=['Cashflow-Revenue']), x='GenderTotal', color='EManager', symbol_sequence=['square'], trendline='ols', y='Cashflow-Revenue',hover_name='Company', color_continuous_scale='pinkyl')
fig
In [ ]: